In [7]:
# !pip install joypy
# !pip install eli5
In [8]:
#!pip install bubbly
In [11]:
import warnings
warnings.filterwarnings('ignore')

# for some basic operations
import numpy as np 
import pandas as pd 
import joypy

# for visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import plotting
from pandas.plotting import parallel_coordinates

# for interactive visualizations
import plotly
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode(connected = True)
import plotly.figure_factory as ff

# for animated visualizations
from bubbly.bubbly import bubbleplot

# for providing path
import os

# for modelling
import sklearn
import imblearn

# for model explanation
import shap 
In [13]:
# let's import the data
data = pd.read_csv('insurance_claims.csv')

# let's take a look at the data
data.head()
Out[13]:
months_as_customer age policy_number policy_bind_date policy_state policy_csl policy_deductable policy_annual_premium umbrella_limit insured_zip ... police_report_available total_claim_amount injury_claim property_claim vehicle_claim auto_make auto_model auto_year fraud_reported _c39
0 328 48 521585 2014-10-17 OH 250/500 1000 1406.91 0 466132 ... YES 71610 6510 13020 52080 Saab 92x 2004 Y NaN
1 228 42 342868 2006-06-27 IN 250/500 2000 1197.22 5000000 468176 ... ? 5070 780 780 3510 Mercedes E400 2007 Y NaN
2 134 29 687698 2000-09-06 OH 100/300 2000 1413.14 5000000 430632 ... NO 34650 7700 3850 23100 Dodge RAM 2007 N NaN
3 256 41 227811 1990-05-25 IL 250/500 2000 1415.74 6000000 608117 ... NO 63400 6340 6340 50720 Chevrolet Tahoe 2014 Y NaN
4 228 44 367455 2014-06-06 IL 500/1000 1000 1583.91 6000000 610706 ... NO 6500 1300 650 4550 Accura RSX 2009 N NaN

5 rows × 40 columns

In [14]:
# let's take a look at the sample of the data

data.sample(5)
Out[14]:
months_as_customer age policy_number policy_bind_date policy_state policy_csl policy_deductable policy_annual_premium umbrella_limit insured_zip ... police_report_available total_claim_amount injury_claim property_claim vehicle_claim auto_make auto_model auto_year fraud_reported _c39
343 334 47 156694 2001-05-24 IL 500/1000 500 1238.89 0 600561 ... NO 6240 960 960 4320 Ford Fusion 2011 N NaN
688 290 47 885789 2008-07-21 IN 250/500 1000 1393.34 0 472922 ... YES 56160 6240 12480 37440 Audi A5 2002 N NaN
465 33 33 758740 1997-08-04 IL 500/1000 1000 1096.79 6000000 446898 ... ? 81400 8140 8140 65120 BMW M5 1998 N NaN
23 413 55 115399 1991-02-08 IN 100/300 2000 1268.79 0 453148 ... ? 98160 8180 16360 73620 Dodge RAM 2011 Y NaN
888 381 55 963761 1991-04-13 OH 500/1000 500 1459.99 0 445856 ... YES 60600 12120 6060 42420 Accura TL 2011 N NaN

5 rows × 40 columns

In [16]:
# let's check whether the data has any null values or not.
# but there are '?' in the datset which we have to remove by NaN Values
data = data.replace('?',np.NaN)

data.isnull().any()
Out[16]:
months_as_customer             False
age                            False
policy_number                  False
policy_bind_date               False
policy_state                   False
policy_csl                     False
policy_deductable              False
policy_annual_premium          False
umbrella_limit                 False
insured_zip                    False
insured_sex                    False
insured_education_level        False
insured_occupation             False
insured_hobbies                False
insured_relationship           False
capital-gains                  False
capital-loss                   False
incident_date                  False
incident_type                  False
collision_type                  True
incident_severity              False
authorities_contacted          False
incident_state                 False
incident_city                  False
incident_location              False
incident_hour_of_the_day       False
number_of_vehicles_involved    False
property_damage                 True
bodily_injuries                False
witnesses                      False
police_report_available         True
total_claim_amount             False
injury_claim                   False
property_claim                 False
vehicle_claim                  False
auto_make                      False
auto_model                     False
auto_year                      False
fraud_reported                 False
_c39                            True
dtype: bool
In [17]:
# missing value treatment using fillna
# we will replace the '?' by the most common collision type as we are unaware of the type.
data['collision_type'].fillna(data['collision_type'].mode()[0], inplace = True)

# It may be the case that there are no responses for property damage then we might take it as No property damage.
data['property_damage'].fillna('NO', inplace = True)

# again, if there are no responses fpr police report available then we might take it as No report available
data['police_report_available'].fillna('NO', inplace = True)

data.isnull().any().any()
Out[17]:
True
In [20]:
data.drop(['_c39'], axis=1, inplace=True)

Data Visualizations¶

In [21]:
fraud = data['fraud_reported'].value_counts()

label_fraud = fraud.index
size_fraud = fraud.values

colors = ['silver', 'gold']
trace = go.Pie(
         labels = label_fraud, values = size_fraud, marker = dict(colors = colors), name = 'Frauds', hole = 0.3)


df = [trace]

layout = go.Layout(
           title = 'Distribution of Frauds')

fig = go.Figure(data = df, layout = layout)

py.iplot(fig)
In [22]:
fig, axes = joypy.joyplot(data,
                         column = ['incident_hour_of_the_day','number_of_vehicles_involved', 'witnesses'],
                         by = 'incident_city',
                         ylim = 'own',
                         figsize = (20, 10),
                         alpha = 0.5, 
                         legend = True)

plt.title('Incident hour, No. of vehicles, witnesses vs Incident City', fontsize = 20)
plt.show()
In [23]:
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (15, 8)

sns.stripplot(data['property_damage'], data['property_claim'], palette = 'bone')
plt.title('Incident Type vs Vehicle Claim', fontsize = 20)
plt.show()
In [24]:
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (15, 8)

sns.boxenplot(data['incident_type'], data['vehicle_claim'], palette = 'pink')
plt.title('Incident Type vs Vehicle Claim', fontsize = 20)
plt.show()
In [25]:
incident = pd.crosstab(data['incident_city'], data['incident_type'])
colors = plt.cm.Blues(np.linspace(0, 1, 5))
incident.div(incident.sum(1).astype(float), axis = 0).plot(kind = 'bar',
                                                           stacked = False,
                                                           figsize = (15, 7),
                                                           color = colors)

plt.title('Incident Type vs Collision Type', fontsize = 20)
plt.legend()
plt.show()
In [26]:
incident = pd.crosstab(data['incident_type'], data['incident_severity'])
colors = plt.cm.summer(np.linspace(0, 1, 5))
incident.div(incident.sum(1).astype(float), axis = 0).plot(kind = 'bar',
                                                           stacked = False,
                                                           figsize = (15, 7),
                                                           color = colors)

plt.title('Incident Type vs Collision Type', fontsize = 20)
plt.legend()
plt.show()
In [27]:
incident = pd.crosstab(data['incident_type'], data['collision_type'])
colors = plt.cm.inferno(np.linspace(0, 1, 5))
incident.div(incident.sum(1).astype(float), axis = 0).plot(kind = 'bar',
                                                           stacked = True,
                                                           figsize = (15, 7),
                                                           color = colors)

plt.title('Incident Type vs Collision Type', fontsize = 20)
plt.legend()
plt.show()
In [28]:
# let's check the insured hobbies

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (15, 8)

sns.countplot(data['insured_occupation'], palette = 'PuRd')
plt.title('Different Types of Occupation of Insured Customers', fontsize = 20)
plt.xticks(rotation = 90)
plt.show()
In [29]:
# let's check the insured hobbies

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (15, 8)

sns.countplot(data['insured_hobbies'], palette = 'cool')
plt.title('Different Types of Hobbies of Insured Customers', fontsize = 20)
plt.xticks(rotation = 90)
plt.show() 
In [30]:
# let's check the incident types

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (15, 8)

sns.countplot(data['incident_type'], palette = 'spring')
plt.title('Different Types of Incidents', fontsize = 20)
plt.show()
In [31]:
# swarm plot

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (15, 8)

sns.swarmplot(data['policy_state'], data['total_claim_amount'], palette = 'copper')
plt.title('Policy State vs Total Claim Amount', fontsize = 20)
plt.show()
In [32]:
# Plot
plt.figure(figsize=(20, 10), dpi= 80)

parallel_coordinates(data[['total_claim_amount','injury_claim', 'property_claim','vehicle_claim','fraud_reported']],
                     'fraud_reported',  colormap = 'copper')

# Lighten borders
plt.gca().spines["top"].set_alpha(0)
plt.gca().spines["bottom"].set_alpha(.3)
plt.gca().spines["right"].set_alpha(0)
plt.gca().spines["left"].set_alpha(.3)

plt.title('DC', fontsize = 20)
plt.grid(alpha=0.3)


plt.suptitle('total claim, Injury claim, Property claim, vehicle claim vs Fraud Reported', fontsize = 20)
plt.show()
In [34]:
data.head()
Out[34]:
months_as_customer age policy_number policy_bind_date policy_state policy_csl policy_deductable policy_annual_premium umbrella_limit insured_zip ... witnesses police_report_available total_claim_amount injury_claim property_claim vehicle_claim auto_make auto_model auto_year fraud_reported
0 328 48 521585 2014-10-17 OH 250/500 1000 1406.91 0 466132 ... 2 YES 71610 6510 13020 52080 Saab 92x 2004 Y
1 228 42 342868 2006-06-27 IN 250/500 2000 1197.22 5000000 468176 ... 0 NO 5070 780 780 3510 Mercedes E400 2007 Y
2 134 29 687698 2000-09-06 OH 100/300 2000 1413.14 5000000 430632 ... 3 NO 34650 7700 3850 23100 Dodge RAM 2007 N
3 256 41 227811 1990-05-25 IL 250/500 2000 1415.74 6000000 608117 ... 2 NO 63400 6340 6340 50720 Chevrolet Tahoe 2014 Y
4 228 44 367455 2014-06-06 IL 500/1000 1000 1583.91 6000000 610706 ... 1 NO 6500 1300 650 4550 Accura RSX 2009 N

5 rows × 39 columns

In [35]:
data = data.sort_values(by='auto_year')
In [36]:
import warnings
warnings.filterwarnings('ignore')

figure = bubbleplot(dataset = data, x_column = 'policy_annual_premium', y_column = 'total_claim_amount', 
    bubble_column = 'insured_sex', time_column = 'auto_year', size_column = 'months_as_customer', color_column = 'insured_sex', 
    x_title = "Annual Policy Premium", y_title = "Total Claim Amount", title = 'Annual Premium vs Total Claim Amount vs Months as Customer',
    x_logscale = False, scale_bubble = 3, height = 650)

py.iplot(figure, config={'scrollzoom': True})
In [37]:
trace = go.Histogram(
          x = data['insured_education_level'],
          name = 'Marvel',
          opacity = 0.75,
          marker = dict(
                 color = 'rgb(195, 195, 145, 0.5)'
          )
)
df = [trace]

layout = go.Layout(
    title = 'Education Level of the Customers')

fig = go.Figure(data = df, layout = layout)
py.iplot(fig)
In [38]:
trace = go.Histogram(
          x = data['insured_occupation'],
          name = 'Marvel',
          opacity = 0.75,
          marker = dict(
                 color = 'rgb(15, 255, 185, 0.5)'
          )
)
df = [trace]

layout = go.Layout(
    title = 'Occupation of the Customers')

fig = go.Figure(data = df, layout = layout)
py.iplot(fig)
In [39]:
sex = data['insured_sex'].value_counts()
rel = data['insured_relationship'].value_counts()

label_sex = sex.index
size_sex = sex.values

label_rel = rel.index
size_rel = rel.values

colors = ['aqua', 'gold']
trace = go.Pie(
         labels = label_sex, values = size_sex, marker = dict(colors = colors), name = 'Gender', hole = 0.3)

colors2 = ['pink', 'lightblue','lightgreen','grey','red']
trace2 = go.Pie(labels = label_rel, values = size_rel, marker = dict(colors = colors2), name = 'Relationship',
                hole = 0.3)

df = [trace]
df2 = [trace2]

layout1 = go.Layout(
           title = 'Gender of the Customers')
layout2 = go.Layout(
           title = 'Relationship')

fig = go.Figure(data = df, layout = layout1)
fig2 = go.Figure(data = df2, layout = layout2)
py.iplot(fig)
py.iplot(fig2)
In [40]:
trace = go.Violin(
          x = data['insured_sex'],
          y = data['insured_zip'],
          name = 'Gender vs Insured Zip',
          opacity = 0.75,
          marker = dict(
                 color = 'rgb(215, 5, 185, 0.5)'
          )
)
df = [trace]

layout = go.Layout(
    title = 'Gender vs Insured Zip')

fig = go.Figure(data = df, layout = layout)
py.iplot(fig)
In [41]:
trace = go.Box(
          x = data['auto_make'],
          y = data['vehicle_claim'],
          opacity = 0.7,
          marker = dict(
                 color = 'rgb(215, 195, 5, 0.5)'
          )
)
df = [trace]

layout = go.Layout(
    title = 'Automobile Company vs Vehicle Claim')

fig = go.Figure(data = df, layout = layout)
py.iplot(fig)
In [42]:
trace = go.Histogram(
          x = data['policy_annual_premium'],
          
          #fill = 'tozeroy',
          marker = dict(
                 color = 'rgb(100, 75, 25, 0.5)'
          )
)
df = [trace]

layout = go.Layout(
    title = 'Distribution of Annual Policy among the Customers',
    scene = dict(
            xaxis = dict(title  = 'Age'),
            yaxis = dict(title  = 'Count')
        ))

fig = go.Figure(data = df, layout = layout)
py.iplot(fig)
In [43]:
trace = go.Histogram(
          x = data['age'],
          
          #fill = 'tozeroy',
          marker = dict(
                 color = 'rgb(215, 245, 5, 0.5)'
          )
)
df = [trace]

layout = go.Layout(
    title = 'Distribution of Age among the Customers',
    scene = dict(
            xaxis = dict(title  = 'Age'),
            yaxis = dict(title  = 'Count')
        ))

fig = go.Figure(data = df, layout = layout)
py.iplot(fig)
In [44]:
trace = go.Scatter3d(
    x = data['age'],
    y = data['property_claim'],
    z = data['vehicle_claim'],
    mode = 'markers',
    marker = dict(
         size = 10,
         color = data['age']
    )
)

df = [trace]

layout = go.Layout(
    title = 'Cholestrol vs Heart Rate vs Age',
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0  
    ),
    scene = dict(
            xaxis = dict(title  = 'Age'),
            yaxis = dict(title  = 'Property_claim'),
            zaxis = dict(title  = 'Vehicle_claim')
        )
    
)
fig = go.Figure(data = df, layout=layout)
py.iplot(fig)

Data Processing¶

In [45]:
# let's extrat days, month and year from policy bind date

data['policy_bind_date'] = pd.to_datetime(data['policy_bind_date'], errors = 'coerce')
In [46]:
# let's encode the fraud report to numerical values

data['fraud_reported'] = data['fraud_reported'].replace(('Y','N'),(0,1))

# checking the values of fraud reported
# data['fraud_reported'].value_counts()
In [47]:
# let's check the correlation of authorities_contacted with the target

data[['auto_model','fraud_reported']].groupby(['auto_model'], 
                as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
Out[47]:
auto_model fraud_reported
0 3 Series 0.944444
31 RSX 0.916667
25 Malibu 0.900000
36 Wrangler 0.880952
29 Pathfinder 0.870968
35 Ultima 0.869565
9 Camry 0.857143
11 Corolla 0.850000
8 CRV 0.850000
21 Legacy 0.843750
27 Neon 0.837838
3 95 0.814815
33 TL 0.800000
2 93 0.800000
23 MDX 0.777778
6 Accord 0.769231
17 Grand Cherokee 0.760000
13 Escape 0.750000
12 E400 0.740741
4 A3 0.729730
18 Highlander 0.727273
28 Passat 0.727273
1 92x 0.714286
20 Jetta 0.714286
16 Fusion 0.714286
15 Forrestor 0.714286
26 Maxima 0.708333
19 Impreza 0.700000
37 X5 0.695652
30 RAM 0.674419
22 M5 0.666667
5 A5 0.656250
10 Civic 0.636364
14 F150 0.629630
34 Tahoe 0.625000
7 C300 0.611111
24 ML350 0.600000
32 Silverado 0.590909
38 X6 0.562500
In [48]:
# let's perform target encoding for auto make

data['auto_make'] = data['auto_make'].replace(('3 Series','RSX','Malibu','Wrangler','Pathfinder','Ultima','Camry',
                'Corolla','CRV','Legacy','Neon','95','TL','93','MDX','Accord','Grand Cherokee','Escape','E4000',
            'A3','Highlander','Passat','92x','Jetta','Fusion','Forrestor','Maxima','Impreza','X5','RAM','M5','A5',
                'Civic','F150','Tahaoe','C300','ML350','Silverado','X6'),
                (0.95,0.91, 0.90,0.88,0.87,0.86,0.855,0.85,0.85,0.84,0.83,0.81,0.80,0.80,0.78,0.77,0.76,0.75,0.74,
                 0.73,0.72,0.72,0.71,0.71,0.71,0.71,0.70,0.70,0.69,0.67,0.66,0.65,0.64,0.63,0.62,0.61,0.60,0.59,0.56))

# let's check the values
# data['auto_make'].value_counts()
In [49]:
# let's check the correlation auto make with the target

data[['auto_make','fraud_reported']].groupby(['auto_make'], 
                as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
Out[49]:
auto_make fraud_reported
7 Jeep 0.835821
9 Nissan 0.820513
12 Toyota 0.814286
0 Accura 0.808824
10 Saab 0.775000
11 Suburu 0.762500
4 Dodge 0.750000
6 Honda 0.745455
3 Chevrolet 0.723684
2 BMW 0.722222
13 Volkswagen 0.720588
1 Audi 0.695652
5 Ford 0.694444
8 Mercedes 0.661538
In [50]:
# let's perform target encoding for auto make

data['auto_make'] = data['auto_make'].replace(('Jeep','Nissan','Toyota','Accura','Saab','Suburu',
                                'Dodge','Honda','Chevrolet','BMW','Volkswagen','Audi','Ford','Mercedes'),
                                              (0.84,0.82,0.81,0.80,0.77,0.76,0.75,0.74,0.73,0.72,0.71,0.69,0.69,0.66))

# let's check the values
# data['auto_make'].value_counts()
In [51]:
# let's check the correlation of authorities_contacted with the target

data[['police_report_available','fraud_reported']].groupby(['police_report_available'], 
                as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
Out[51]:
police_report_available fraud_reported
1 YES 0.770701
0 NO 0.744898
In [52]:
# let's perform target encoding for property damage

data['police_report_available'] = data['police_report_available'].replace(('NO','YES'),(0.77,0.74))

# let's check the values
# data['police_report_available'].value_counts()
In [53]:
# let's check the correlation of authorities_contacted with the target

data[['property_damage','fraud_reported']].groupby(['property_damage'], 
                as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
Out[53]:
property_damage fraud_reported
0 NO 0.757880
1 YES 0.741722
In [54]:
# let's perform target encoding for property damage

data['property_damage'] = data['property_damage'].replace(('NO','YES'),(0.76,0.74))

# let's check the values
# data['property_damage'].value_counts()
In [55]:
# let's check the correlation of authorities_contacted with the target

data[['incident_city','fraud_reported']].groupby(['incident_city'], 
                as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
Out[55]:
incident_city fraud_reported
4 Northbrook 0.778689
5 Riverwood 0.776119
3 Northbend 0.765517
6 Springfield 0.757962
2 Hillsdale 0.751773
1 Columbus 0.738255
0 Arlington 0.710526
In [56]:
# let's do target encoding for incident city

data['incident_city'] = data['incident_city'].replace(('Northbrook','Riverwood','Northbend','Springfield',
                                    'Hillsdale','Columbus','Arlington'),(0.78,0.77,0.76,0.75,0.74,0.73,0.71))

# let's check the values
# data['incident_city'].value_counts()
In [57]:
# let's check the correlation of authorities_contacted with the target

data[['incident_state','fraud_reported']].groupby(['incident_state'], 
                as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
Out[57]:
incident_state fraud_reported
6 WV 0.820276
1 NY 0.778626
5 VA 0.772727
3 PA 0.733333
4 SC 0.705645
0 NC 0.690909
2 OH 0.565217
In [58]:
# let's perform target encoding for incident state

data['incident_state'] = data['incident_state'].replace(('WV','NY','VA','PA','SC','NC','OH'),
                                                        (0.82,0.77,0.76,0.73,0.70,0.69,0.56))

# checking the values
# data['incident_state'].value_counts()
In [59]:
# let's check the correlation of authorities_contacted with the target

data[['authorities_contacted','fraud_reported']].groupby(['authorities_contacted'], 
                as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
Out[59]:
authorities_contacted fraud_reported
2 None 0.934066
4 Police 0.791096
1 Fire 0.730942
0 Ambulance 0.709184
3 Other 0.681818
In [60]:
# let's perform target encoding for authorities contacted

data['authorities_contacted'] = data['authorities_contacted'].replace(('None','Police','Fire','Ambulance','Other'),
                                                                      (0.94,0.79,0.73,0.70,0.68))

# let's check the values
#data['authorities'].value_counts()
In [61]:
# let's check the correlation of incident_severity with the target

data[['incident_severity','fraud_reported']].groupby(['incident_severity'], 
                as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
Out[61]:
incident_severity fraud_reported
3 Trivial Damage 0.933333
1 Minor Damage 0.892655
2 Total Loss 0.871429
0 Major Damage 0.394928
In [62]:
# let's perform target encoding for incident severity

data['incident_severity'] = data['incident_severity'].replace(('Trivial Damage','Minor Damage','Total Loss',
                                                              'Major Damage'),(0.94,0.89,0.87,0.39))

# let's check the values
# data['incident_severity'].value_counts()
In [63]:
# let's check the correlation of collision_type with the target

data[['collision_type','fraud_reported']].groupby(['collision_type'], 
                as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
Out[63]:
collision_type fraud_reported
1 Rear Collision 0.772340
2 Side Collision 0.746377
0 Front Collision 0.724409
In [64]:
# let's perform target encoding for collision type

data['collision_type'] = data['collision_type'].replace(('Rear Collision', 'Side Collision', 'Front Collision'),
                                                        (0.78,0.74,0.72))

# let's check the values of collision type
# data['collision_type'].value_counts()
In [65]:
# let's check the correlation of incident_type with the target

data[['incident_type','fraud_reported']].groupby(['incident_type'], 
                as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
Out[65]:
incident_type fraud_reported
3 Vehicle Theft 0.914894
1 Parked Car 0.904762
0 Multi-vehicle Collision 0.727924
2 Single Vehicle Collision 0.709677
In [66]:
# let's perform target encoing for incident type

data['incident_type'] = data['incident_type'].replace(('Vehicle Theft','Parked Car','Multi-vehicle Collision',
                                'Single Vehicle Collision'),(0.91, 0.90, 0.72,0.70))

# let's check the values
#data['incident_type'].value_counts()
In [67]:
data['incident_date'] = pd.to_datetime(data['incident_date'], errors = 'coerce')

# extracting days and month from date
data['incident_month'] = data['incident_date'].dt.month
data['incident_day'] = data['incident_date'].dt.day
In [68]:
# let's know the relation between insured_relationship and fraud reported

data[['insured_relationship','fraud_reported']].groupby(['insured_relationship'], 
                as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
Out[68]:
insured_relationship fraud_reported
0 husband 0.794118
3 own-child 0.786885
4 unmarried 0.758865
1 not-in-family 0.741379
5 wife 0.729032
2 other-relative 0.706215
In [69]:
# let's do target encoding for insured relationship

data['insured_relationship'] = data['insured_relationship'].replace(('husband','own-child','unmarried',
                                        'not-in-family','wife','other-relative'),(0.79,0.78,0.75,0.74,0.72,0.70))

#data['insured-relationship'].value_counts()
In [70]:
# let's know the relation between insured_hobbies and fraud reported

data[['insured_hobbies','fraud_reported']].groupby(['insured_hobbies'], 
                as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
Out[70]:
insured_hobbies fraud_reported
4 camping 0.909091
11 kayaking 0.907407
9 golf 0.890909
7 dancing 0.883721
3 bungie-jumping 0.839286
12 movies 0.836364
1 basketball 0.823529
8 exercise 0.807018
17 sleeping 0.804878
18 video-games 0.800000
16 skydiving 0.775510
13 paintball 0.771930
10 hiking 0.769231
0 base-jumping 0.734694
15 reading 0.734375
14 polo 0.723404
2 board-games 0.708333
19 yachting 0.698113
6 cross-fit 0.257143
5 chess 0.173913
In [71]:
# let's perform target encoding for insured_hobbies

data['insured_hobbies'] = data['insured_hobbies'].replace(('camping', 'kayaking', 'golf','dancing',
        'bungie-jumping','movies', 'basketball','exercise','sleeping','video-games','skydiving','paintball',
            'hiking','base-jumping','reading','polo','board-games','yachting', 'cross-fit','chess'),(0.91, 0.90,
                0.89, 0.88,0.84,0.83,0.82,0.81,0.805,0.80,0.78,0.77,0.76,0.73,0.73,0.72,0.70,0.69,0.25,0.17))

#data['insured_hobbies'].value_counts()
In [72]:
# let's know the relation between insured_occupation and fraud reported

data[['insured_occupation','fraud_reported']].groupby(['insured_occupation'], 
                as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
Out[72]:
insured_occupation fraud_reported
7 other-service 0.830986
8 priv-house-serv 0.830986
0 adm-clerical 0.830769
5 handlers-cleaners 0.796296
9 prof-specialty 0.788235
10 protective-serv 0.777778
6 machine-op-inspct 0.763441
1 armed-forces 0.753623
11 sales 0.723684
12 tech-support 0.717949
13 transport-moving 0.708333
2 craft-repair 0.702703
4 farming-fishing 0.698113
3 exec-managerial 0.631579
In [73]:
# let's perform target encoding for insured_occupation

data['insured_occupation'] = data['insured_occupation'].replace(('other-service','priv-house-serv',
                        'adm-clerical','handlers-cleaners','prof-specialty','protective-serv',
                'machine-op-inspct','armed-forces','sales','tech-support','transport-moving','craft-repair',
                    'farming-fishing','exec-managerial'),(0.84, 0.84,0.83, 0.79,0.78,0.77,0.76,0.75,0.72,0.71,
                                                          0.705,0.70,0.69,0.63))
# data['insured_occupation'].value_counts()
In [74]:
# let's know the relation of insured_education_level with faud_reported

data[['insured_education_level','fraud_reported']].groupby(['insured_education_level'], 
                as_index = False).mean().sort_values(by = 'fraud_reported', ascending = False)
Out[74]:
insured_education_level fraud_reported
5 Masters 0.776224
2 High School 0.775000
0 Associate 0.765517
3 JD 0.739130
1 College 0.737705
4 MD 0.736111
6 PhD 0.736000
In [75]:
# let's perform target encoding

data['insured_education_level'] = data['insured_education_level'].replace(('Masters', 'High School','Associate',
                                        'JD','College', 'MD','PhD'),(0.78,0.77,0.76,0.74,0.73,0.72,0.71))
#data['insured_education_level'].value_counts()
In [76]:
# lets know the relation of insured sex and fraud reported

data[['insured_sex','fraud_reported']].groupby(['insured_sex'], as_index = False).mean().sort_values(
    by = 'fraud_reported', ascending = False)
Out[76]:
insured_sex fraud_reported
0 FEMALE 0.765363
1 MALE 0.738661
In [77]:
# target encoding for sex

data['insured_sex'] = data['insured_sex'].replace(('FEMALE','MALE'),(0.76,0.73))
#data['insured_sex'].value_counts()
In [78]:
# csl - combined single limit

'''CSL is a single number that describes the predetermined limit for the combined total of the Bodily Injury 
Liability coverage and Property Damage Liability coverage per occurrence or accident.'''

# lets know the relation of policy state and fraud reported

data[['policy_csl','fraud_reported']].groupby(['policy_csl'], as_index = False).mean().sort_values(
    by = 'fraud_reported', ascending = False)
Out[78]:
policy_csl fraud_reported
2 500/1000 0.783333
0 100/300 0.742120
1 250/500 0.737892
In [79]:
# target encoding for policy_csl

data['policy_csl'] = data['policy_csl'].replace(('500/1000','100/300','250/500'),(0.78,0.74,0.73))

# check the values
# data['policy_csl'].value_counts()
In [80]:
# lets know the relation of policy state and fraud reported

data[['policy_state','fraud_reported']].groupby(['policy_state'], as_index = False).mean().sort_values(
    by = 'fraud_reported', ascending = False)
Out[80]:
policy_state fraud_reported
0 IL 0.772189
1 IN 0.745161
2 OH 0.741477
In [81]:
# target encoding for policy_csl

data['policy_state'] = data['policy_state'].replace(('IL','IN','OH'),(0.77,0.745,0.74))

# check the values
# data['policy_state'].value_counts()
In [82]:
# let's delete unnecassary columns

data = data.drop(['policy_number','policy_bind_date', 'incident_date','incident_location','auto_model'], axis = 1)

# let's check the columns after deleting the columns
data.columns
Out[82]:
Index(['months_as_customer', 'age', 'policy_state', 'policy_csl',
       'policy_deductable', 'policy_annual_premium', 'umbrella_limit',
       'insured_zip', 'insured_sex', 'insured_education_level',
       'insured_occupation', 'insured_hobbies', 'insured_relationship',
       'capital-gains', 'capital-loss', 'incident_type', 'collision_type',
       'incident_severity', 'authorities_contacted', 'incident_state',
       'incident_city', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_year', 'fraud_reported', 'incident_month', 'incident_day'],
      dtype='object')
In [83]:
# let's split the data into dependent and independent sets

x = data.drop(['fraud_reported'], axis = 1)
y = data['fraud_reported']

print("Shape of x :", x.shape)
print("Shape of y :", y.shape)
Shape of x : (1000, 35)
Shape of y : (1000,)
In [84]:
# let's split the dataset into train and test sets

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

print("Shape of x_train :", x_train.shape)
print("Shape of x_test :", x_test.shape)
print("Shape of y_train :", y_train.shape)
print("Shape of y_test :", y_test.shape)
Shape of x_train : (800, 35)
Shape of x_test : (200, 35)
Shape of y_train : (800,)
Shape of y_test : (200,)
In [85]:
plt.rcParams['figure.figsize'] = (15, 10)
sns.heatmap(x_train.corr(), cmap = 'copper')
plt.title('Heat Map for Correlations', fontsize = 20)
plt.show()

Modelling with Ensemble of Samplers¶

Random Forest Classifier

In [86]:
# Random Forest Classifier

from imblearn.ensemble import BalancedRandomForestClassifier 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
                 

model = BalancedRandomForestClassifier(n_estimators = 100, random_state = 0)

model.fit(x_train, y_train)
y_pred_rf = model.predict(x_test)

print("Training Accuracy: ", model.score(x_train, y_train))
print('Testing Accuarcy: ', model.score(x_test, y_test))

# making a classification report
cr = classification_report(y_test,  y_pred_rf)
print(cr)

# making a confusion matrix
plt.rcParams['figure.figsize'] = (5, 5)
cm = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(cm, annot = True, cmap = 'spring')
plt.show()
Training Accuracy:  0.91375
Testing Accuarcy:  0.86
              precision    recall  f1-score   support

           0       0.65      0.87      0.74        46
           1       0.96      0.86      0.90       154

    accuracy                           0.86       200
   macro avg       0.80      0.86      0.82       200
weighted avg       0.88      0.86      0.87       200

Easy Ensemble Classifier

In [87]:
# Easy Ensemble Classifier

from imblearn.ensemble import EasyEnsembleClassifier 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
                 

model1 = EasyEnsembleClassifier(n_estimators = 100, random_state = 0)

model1.fit(x_train, y_train)
y_pred_ef = model1.predict(x_test)

print("Training Accuracy: ", model1.score(x_train, y_train))
print('Testing Accuarcy: ', model1.score(x_test, y_test))

# making a classification report
cr = classification_report(y_test,  y_pred_ef)
print(cr)

# making a confusion matrix
cm = confusion_matrix(y_test, y_pred_ef)
sns.heatmap(cm, annot = True, cmap = 'copper')
plt.show()
Training Accuracy:  0.8425
Testing Accuarcy:  0.825
              precision    recall  f1-score   support

           0       0.58      0.85      0.69        46
           1       0.95      0.82      0.88       154

    accuracy                           0.82       200
   macro avg       0.76      0.83      0.78       200
weighted avg       0.86      0.82      0.83       200

Bagging Classifier

In [88]:
# Random Forest with Bagging Classifier

from imblearn.ensemble import BalancedBaggingClassifier 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
                 

model2 = BalancedBaggingClassifier(base_estimator = RandomForestClassifier(),
                                 sampling_strategy = 'auto',
                                 replacement = False,
                                 random_state = 0)

model2.fit(x_train, y_train)
y_pred_bc = model2.predict(x_test)

print("Training Accuracy: ", model2.score(x_train, y_train))
print('Testing Accuarcy: ', model2.score(x_test, y_test))

# making a classification report
cr = classification_report(y_test,  y_pred_bc)
print(cr)

# making a confusion matrix
cm = confusion_matrix(y_test, y_pred_bc)
sns.heatmap(cm, annot = True, cmap = 'Purples')
plt.show()
Training Accuracy:  0.92625
Testing Accuarcy:  0.855
              precision    recall  f1-score   support

           0       0.65      0.80      0.72        46
           1       0.94      0.87      0.90       154

    accuracy                           0.85       200
   macro avg       0.79      0.84      0.81       200
weighted avg       0.87      0.85      0.86       200

Boosting the Predictions of above Models

In [89]:
# boosting

y_pred = y_pred_rf*0.5 + y_pred_ef*0.2 + y_pred_bc*0.3

y_pred[y_pred > 0.5] = 1
y_pred[y_pred <= 0.5] = 0

# making a classification report
cr = classification_report(y_test,  y_pred)
print(cr)

# making a confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot = True, cmap = 'Reds')
plt.show()
              precision    recall  f1-score   support

           0       0.65      0.87      0.74        46
           1       0.96      0.86      0.90       154

    accuracy                           0.86       200
   macro avg       0.80      0.86      0.82       200
weighted avg       0.88      0.86      0.87       200

Voting Classifier

In [90]:
from sklearn.ensemble import VotingClassifier

vote_est = [ 
    ('brf', BalancedRandomForestClassifier()),
    ('bc', BalancedBaggingClassifier()),
    ('eec',EasyEnsembleClassifier())]

voting = VotingClassifier(estimators = vote_est , voting = 'soft')
voting.fit(x_train, y_train)

y_pred = voting.predict(x_test).astype(int)

# making a classification report
cr = classification_report(y_test,  y_pred)
print(cr)

# making a confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot = True, cmap = 'magma')
plt.show()
              precision    recall  f1-score   support

           0       0.67      0.89      0.77        46
           1       0.96      0.87      0.91       154

    accuracy                           0.88       200
   macro avg       0.82      0.88      0.84       200
weighted avg       0.90      0.88      0.88       200

Applying Sampling Techniques¶

Under Sampling

In [91]:
y.value_counts()
Out[91]:
1    753
0    247
Name: fraud_reported, dtype: int64
In [92]:
frauds = np.array(data[data['fraud_reported'] == 0].index)
no_frauds = len(frauds)

print(no_frauds)
247
In [93]:
normal_indices = data[data['fraud_reported'] == 1]
no_normal_indices = len(normal_indices)

print(no_normal_indices)
753
In [94]:
random_normal_indices = np.random.choice(no_normal_indices, size = no_frauds, replace = True)
random_normal_indices = np.array(random_normal_indices)

print(len(random_normal_indices))
247
In [95]:
under_sample = np.concatenate([frauds, random_normal_indices])
print(len(under_sample))
494
In [96]:
# creating the undersample data

undersample_data = data.iloc[under_sample, :]
In [97]:
# splitting the undersample dataset into x and y sets

x_u = undersample_data.iloc[:, undersample_data.columns != 'fraud_reported'] 
y_u = undersample_data.iloc[:, undersample_data.columns == 'fraud_reported']

print(x_u.shape)
print(y_u.shape)
(494, 35)
(494, 1)
In [98]:
from sklearn.model_selection import train_test_split

x_train1, x_test1, y_train1, y_test1 = train_test_split(x_u, y_u, test_size = 0.2, random_state = 0)

print(x_train1.shape)
print(y_train1.shape)
print(x_test1.shape)
(395, 35)
(395, 1)
(99, 35)
In [99]:
# standardization

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
x_train1 = sc.fit_transform(x_train1)
x_test1 = sc.transform(x_test1)
In [100]:
from sklearn.ensemble import RandomForestClassifier

model_u = RandomForestClassifier()
model_u.fit(x_train1, y_train1)

y_pred = model_u.predict(x_test1)

print("Training Accuracy: ", model_u.score(x_train1, y_train1))
print('Testing Accuarcy: ', model_u.score(x_test1, y_test1))

# confusion matrix
cm = confusion_matrix(y_test1, y_pred)
plt.rcParams['figure.figsize'] = (5, 5)
sns.heatmap(cm, annot = True, cmap = 'winter')
plt.show()

# classification report
cr = classification_report(y_test1, y_pred)
print(cr)
Training Accuracy:  1.0
Testing Accuarcy:  0.8484848484848485
              precision    recall  f1-score   support

           0       0.78      0.35      0.48        20
           1       0.86      0.97      0.91        79

    accuracy                           0.85        99
   macro avg       0.82      0.66      0.70        99
weighted avg       0.84      0.85      0.82        99

Over Sampling with SMOTE

In [101]:
from imblearn.over_sampling import SMOTE

x_resample, y_resample  = SMOTE().fit_sample(x, y.values.ravel())

print(x_resample.shape)
print(y_resample.shape)
-----------------------------------------------------------
AttributeError            Traceback (most recent call last)
Input In [101], in <cell line: 3>()
      1 from imblearn.over_sampling import SMOTE
----> 3 x_resample, y_resample  = SMOTE().fit_sample(x, y.values.ravel())
      5 print(x_resample.shape)
      6 print(y_resample.shape)

AttributeError: 'SMOTE' object has no attribute 'fit_sample'
In [ ]:
from sklearn.model_selection import train_test_split

x_train2, x_test2, y_train2, y_test2 = train_test_split(x_resample, y_resample, test_size = 0.2, random_state = 0)

print(x_train2.shape)
print(y_train2.shape)
print(x_test2.shape)
print(y_test2.shape)
In [ ]:
# standardization

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
x_train2 = sc.fit_transform(x_train2)
x_test2 = sc.transform(x_test2)
In [ ]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

model_o = RandomForestClassifier()
model_o.fit(x_train2, y_train2)

y_pred = model_o.predict(x_test2)

print("Training Accuracy: ", model_o.score(x_train2, y_train2))
print('Testing Accuarcy: ', model_o.score(x_test2, y_test2))

# confusion matrix
cm = confusion_matrix(y_test2, y_pred)
plt.rcParams['figure.figsize'] = (5, 5)
sns.heatmap(cm, annot = True, cmap = 'winter')
plt.show()

# classification report
cr = classification_report(y_test2, y_pred)
print(cr)

Model Explanation for Random Forest Classifier¶

In [ ]:
# let's check the importance of each attributes

from eli5.sklearn import PermutationImportance


perm = PermutationImportance(model, random_state = 0).fit(x_test, y_test)
eli5.show_weights(perm, feature_names = x_test.columns.tolist())
In [108]:
#!pip install pdpbox --user

from pdpbox import pdp, info_plots #for partial plots

base_features = x_train.columns.values.tolist()

feat_name = 'incident_severity'
pdp_dist = pdp.pdp_isolate(model=model, dataset=x_test, model_features = base_features, feature = feat_name)

pdp.pdp_plot(pdp_dist, feat_name)
plt.show()
In [94]:
from pdpbox import pdp, info_plots #for partial plots

base_features = x_train.columns.values.tolist()

feat_name = 'collision_type'
pdp_dist = pdp.pdp_isolate(model=model, dataset=x_test, model_features = base_features, feature = feat_name)

pdp.pdp_plot(pdp_dist, feat_name)
plt.show()
In [95]:
from pdpbox import pdp, info_plots #for partial plots

base_features = x_train.columns.values.tolist()

feat_name = 'incident_severity'
pdp_dist = pdp.pdp_isolate(model=model, dataset=x_test, model_features = base_features, feature = feat_name)

pdp.pdp_plot(pdp_dist, feat_name)
plt.show()
In [96]:
from pdpbox import pdp, info_plots #for partial plots

base_features = x_train.columns.values.tolist()

feat_name = 'insured_zip'
pdp_dist = pdp.pdp_isolate(model=model, dataset=x_test, model_features = base_features, feature = feat_name)

pdp.pdp_plot(pdp_dist, feat_name)
plt.show()
In [97]:
from pdpbox import pdp, info_plots #for partial plots

base_features = x_train.columns.values.tolist()

feat_name = 'age'
pdp_dist = pdp.pdp_isolate(model=model, dataset=x_test, model_features = base_features, feature = feat_name)

pdp.pdp_plot(pdp_dist, feat_name)
plt.show()
In [110]:
# let's see the shap values

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(x_test)

shap.summary_plot(shap_values[1], x_test, plot_type="bar")
In [111]:
shap.summary_plot(shap_values[1], x_test)
In [112]:
# let's create a function to check the patient's conditions

def fraud_analysis(model, fraud):
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(fraud)
    shap.initjs()
    return shap.force_plot(explainer.expected_value[1], shap_values[1], fraud)
In [113]:
# let's do some real time prediction for patients

fraud = x_test.iloc[1,:].astype(float)
fraud_analysis(model, fraud)
Out[113]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [114]:
fraud = x_test.iloc[2,:].astype(float)
fraud_analysis(model, fraud)
Out[114]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [115]:
fraud = x_test.iloc[3,:].astype(float)
fraud_analysis(model, fraud)
Out[115]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [116]:
fraud = x_test.iloc[4,:].astype(float)
fraud_analysis(model, fraud)
Out[116]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [117]:
fraud = x_test.iloc[5,:].astype(float)
fraud_analysis(model, fraud)
Out[117]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [118]:
shap_values = explainer.shap_values(x_train.iloc[:50])
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values[1], x_test.iloc[:50])
Out[118]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [ ]: